In [1]:
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
In [2]:
# Import CSV file
import warnings
warnings.filterwarnings ("ignore", category = UserWarning)
# Import the Pandas package.
import pandas as pd

# Create a variable named data_file_name to hold the name of the data file.
data_file_name = "spotify_songs.csv"

# Load the data file into a Pandas DataFrame.
df = pd.read_csv(data_file_name)


# Verify the DataFrame contents by printing the first 5 rows of data.
df.head(5)
Out[2]:
track_id track_name track_artist track_popularity track_album_id track_album_name track_album_release_date playlist_name playlist_id playlist_genre ... key loudness mode speechiness acousticness instrumentalness liveness valence tempo duration_ms
0 6f807x0ima9a1j3VPbc7VN I Don't Care (with Justin Bieber) - Loud Luxur... Ed Sheeran 66 2oCs0DGTsRO98Gh5ZSl2Cx I Don't Care (with Justin Bieber) [Loud Luxury... 2019-06-14 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... 6 -2.634 1 0.0583 0.1020 0.000000 0.0653 0.518 122.036 194754
1 0r7CVbZTWZgbTCYdfa2P31 Memories - Dillon Francis Remix Maroon 5 67 63rPSO264uRjW1X5E6cWv6 Memories (Dillon Francis Remix) 2019-12-13 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... 11 -4.969 1 0.0373 0.0724 0.004210 0.3570 0.693 99.972 162600
2 1z1Hg7Vb0AhHDiEmnDE79l All the Time - Don Diablo Remix Zara Larsson 70 1HoSmj2eLcsrR0vE9gThr4 All the Time (Don Diablo Remix) 2019-07-05 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... 1 -3.432 0 0.0742 0.0794 0.000023 0.1100 0.613 124.008 176616
3 75FpbthrwQmzHlBJLuGdC7 Call You Mine - Keanu Silva Remix The Chainsmokers 60 1nqYsOef1yKKuGOVchbsk6 Call You Mine - The Remixes 2019-07-19 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... 7 -3.778 1 0.1020 0.0287 0.000009 0.2040 0.277 121.956 169093
4 1e8PAfcKUYoKkxPhrHqw4x Someone You Loved - Future Humans Remix Lewis Capaldi 69 7m7vv9wlQ4i0LFuJiE2zsQ Someone You Loved (Future Humans Remix) 2019-03-05 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... 1 -4.672 1 0.0359 0.0803 0.000000 0.0833 0.725 123.976 189052

5 rows × 23 columns

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32833 entries, 0 to 32832
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  32833 non-null  object 
 1   track_name                32828 non-null  object 
 2   track_artist              32828 non-null  object 
 3   track_popularity          32833 non-null  int64  
 4   track_album_id            32833 non-null  object 
 5   track_album_name          32828 non-null  object 
 6   track_album_release_date  32833 non-null  object 
 7   playlist_name             32833 non-null  object 
 8   playlist_id               32833 non-null  object 
 9   playlist_genre            32833 non-null  object 
 10  playlist_subgenre         32833 non-null  object 
 11  danceability              32833 non-null  float64
 12  energy                    32833 non-null  float64
 13  key                       32833 non-null  int64  
 14  loudness                  32833 non-null  float64
 15  mode                      32833 non-null  int64  
 16  speechiness               32833 non-null  float64
 17  acousticness              32833 non-null  float64
 18  instrumentalness          32833 non-null  float64
 19  liveness                  32833 non-null  float64
 20  valence                   32833 non-null  float64
 21  tempo                     32833 non-null  float64
 22  duration_ms               32833 non-null  int64  
dtypes: float64(9), int64(4), object(10)
memory usage: 5.8+ MB
In [4]:
# Shape of the dataset
shape = df.shape
print(shape)

unique_rows = df["track_id"].unique().shape[0]
print(unique_rows)
(32833, 23)
28356
In [5]:
# Number of null values in each column
df.isnull().sum()
Out[5]:
track_id                    0
track_name                  5
track_artist                5
track_popularity            0
track_album_id              0
track_album_name            5
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
dtype: int64
In [6]:
# Save Column names
col_names = df.columns
In [7]:
# Discretize Popularity of each song
# create blank column of zeroes
col = np.zeros((shape[0],1))
df_popularity_category = pd.DataFrame(col)
print(type(df_popularity_category))

# concatenate the dataframes
df = pd.concat([df,df_popularity_category], axis = 1)

df.head(5)
<class 'pandas.core.frame.DataFrame'>
Out[7]:
track_id track_name track_artist track_popularity track_album_id track_album_name track_album_release_date playlist_name playlist_id playlist_genre ... loudness mode speechiness acousticness instrumentalness liveness valence tempo duration_ms 0
0 6f807x0ima9a1j3VPbc7VN I Don't Care (with Justin Bieber) - Loud Luxur... Ed Sheeran 66 2oCs0DGTsRO98Gh5ZSl2Cx I Don't Care (with Justin Bieber) [Loud Luxury... 2019-06-14 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... -2.634 1 0.0583 0.1020 0.000000 0.0653 0.518 122.036 194754 0.0
1 0r7CVbZTWZgbTCYdfa2P31 Memories - Dillon Francis Remix Maroon 5 67 63rPSO264uRjW1X5E6cWv6 Memories (Dillon Francis Remix) 2019-12-13 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... -4.969 1 0.0373 0.0724 0.004210 0.3570 0.693 99.972 162600 0.0
2 1z1Hg7Vb0AhHDiEmnDE79l All the Time - Don Diablo Remix Zara Larsson 70 1HoSmj2eLcsrR0vE9gThr4 All the Time (Don Diablo Remix) 2019-07-05 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... -3.432 0 0.0742 0.0794 0.000023 0.1100 0.613 124.008 176616 0.0
3 75FpbthrwQmzHlBJLuGdC7 Call You Mine - Keanu Silva Remix The Chainsmokers 60 1nqYsOef1yKKuGOVchbsk6 Call You Mine - The Remixes 2019-07-19 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... -3.778 1 0.1020 0.0287 0.000009 0.2040 0.277 121.956 169093 0.0
4 1e8PAfcKUYoKkxPhrHqw4x Someone You Loved - Future Humans Remix Lewis Capaldi 69 7m7vv9wlQ4i0LFuJiE2zsQ Someone You Loved (Future Humans Remix) 2019-03-05 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... -4.672 1 0.0359 0.0803 0.000000 0.0833 0.725 123.976 189052 0.0

5 rows × 24 columns

In [8]:
median_popularity = df["track_popularity"].median()
df["popularity_category"] = (df["track_popularity"] > median_popularity).astype(int)
df.head()
Out[8]:
track_id track_name track_artist track_popularity track_album_id track_album_name track_album_release_date playlist_name playlist_id playlist_genre ... mode speechiness acousticness instrumentalness liveness valence tempo duration_ms 0 popularity_category
0 6f807x0ima9a1j3VPbc7VN I Don't Care (with Justin Bieber) - Loud Luxur... Ed Sheeran 66 2oCs0DGTsRO98Gh5ZSl2Cx I Don't Care (with Justin Bieber) [Loud Luxury... 2019-06-14 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... 1 0.0583 0.1020 0.000000 0.0653 0.518 122.036 194754 0.0 1
1 0r7CVbZTWZgbTCYdfa2P31 Memories - Dillon Francis Remix Maroon 5 67 63rPSO264uRjW1X5E6cWv6 Memories (Dillon Francis Remix) 2019-12-13 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... 1 0.0373 0.0724 0.004210 0.3570 0.693 99.972 162600 0.0 1
2 1z1Hg7Vb0AhHDiEmnDE79l All the Time - Don Diablo Remix Zara Larsson 70 1HoSmj2eLcsrR0vE9gThr4 All the Time (Don Diablo Remix) 2019-07-05 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... 0 0.0742 0.0794 0.000023 0.1100 0.613 124.008 176616 0.0 1
3 75FpbthrwQmzHlBJLuGdC7 Call You Mine - Keanu Silva Remix The Chainsmokers 60 1nqYsOef1yKKuGOVchbsk6 Call You Mine - The Remixes 2019-07-19 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... 1 0.1020 0.0287 0.000009 0.2040 0.277 121.956 169093 0.0 1
4 1e8PAfcKUYoKkxPhrHqw4x Someone You Loved - Future Humans Remix Lewis Capaldi 69 7m7vv9wlQ4i0LFuJiE2zsQ Someone You Loved (Future Humans Remix) 2019-03-05 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop ... 1 0.0359 0.0803 0.000000 0.0833 0.725 123.976 189052 0.0 1

5 rows × 25 columns

In [9]:
# Start Data Preprocessing

# Find features to remove:
# Don't need playlist data: playlist_name, playlist_id, playlist_genre, playlist_subgenre

featuresToRemove = []
featuresToRemove.append("playlist_name")
featuresToRemove.append("playlist_id")

for featureToRemove in featuresToRemove:
    del df[featureToRemove]
In [10]:
 df.head(5)
Out[10]:
track_id track_name track_artist track_popularity track_album_id track_album_name track_album_release_date playlist_genre playlist_subgenre danceability ... mode speechiness acousticness instrumentalness liveness valence tempo duration_ms 0 popularity_category
0 6f807x0ima9a1j3VPbc7VN I Don't Care (with Justin Bieber) - Loud Luxur... Ed Sheeran 66 2oCs0DGTsRO98Gh5ZSl2Cx I Don't Care (with Justin Bieber) [Loud Luxury... 2019-06-14 pop dance pop 0.748 ... 1 0.0583 0.1020 0.000000 0.0653 0.518 122.036 194754 0.0 1
1 0r7CVbZTWZgbTCYdfa2P31 Memories - Dillon Francis Remix Maroon 5 67 63rPSO264uRjW1X5E6cWv6 Memories (Dillon Francis Remix) 2019-12-13 pop dance pop 0.726 ... 1 0.0373 0.0724 0.004210 0.3570 0.693 99.972 162600 0.0 1
2 1z1Hg7Vb0AhHDiEmnDE79l All the Time - Don Diablo Remix Zara Larsson 70 1HoSmj2eLcsrR0vE9gThr4 All the Time (Don Diablo Remix) 2019-07-05 pop dance pop 0.675 ... 0 0.0742 0.0794 0.000023 0.1100 0.613 124.008 176616 0.0 1
3 75FpbthrwQmzHlBJLuGdC7 Call You Mine - Keanu Silva Remix The Chainsmokers 60 1nqYsOef1yKKuGOVchbsk6 Call You Mine - The Remixes 2019-07-19 pop dance pop 0.718 ... 1 0.1020 0.0287 0.000009 0.2040 0.277 121.956 169093 0.0 1
4 1e8PAfcKUYoKkxPhrHqw4x Someone You Loved - Future Humans Remix Lewis Capaldi 69 7m7vv9wlQ4i0LFuJiE2zsQ Someone You Loved (Future Humans Remix) 2019-03-05 pop dance pop 0.650 ... 1 0.0359 0.0803 0.000000 0.0833 0.725 123.976 189052 0.0 1

5 rows × 23 columns

In [11]:
# find the number of duplicate track IDs
ids = df["track_id"]

id_counts = ids.value_counts()
duplicated_ids = id_counts[id_counts > 1]
duplicated_track_ids = duplicated_ids.index

# now find full rows of duplicated IDs
condition = df["track_id"].isin(duplicated_track_ids)
duplicated_rows = df[condition]
duplicated_rows = duplicated_rows.sort_values(by = "track_id")
duplicated_rows.head(60)
Out[11]:
track_id track_name track_artist track_popularity track_album_id track_album_name track_album_release_date playlist_genre playlist_subgenre danceability ... mode speechiness acousticness instrumentalness liveness valence tempo duration_ms 0 popularity_category
32084 00Gu3RMpDW2vO9PjlMVFDL Hide Away (feat. Envy Monroe) Blasterjaxx 42 5pqG85igfoeWcCDIsSi9x7 Hide Away (feat. Envy Monroe) 2019-06-21 edm progressive electro house 0.573 ... 1 0.0421 0.02490 0.000000 0.3610 0.134 130.001 188000 0.0 0
28696 00Gu3RMpDW2vO9PjlMVFDL Hide Away (feat. Envy Monroe) Blasterjaxx 42 5pqG85igfoeWcCDIsSi9x7 Hide Away (feat. Envy Monroe) 2019-06-21 edm big room 0.573 ... 1 0.0421 0.02490 0.000000 0.3610 0.134 130.001 188000 0.0 0
23850 00QyLmjxaSEE8qIZQjBXBj We Own It (Fast & Furious) 2 Chainz 59 1jg2UPoSAr7CDPsEXcabo1 Fast & Furious 6 2013-01-01 r&b hip pop 0.554 ... 1 0.4080 0.05210 0.000000 0.0568 0.552 171.966 227893 0.0 1
28968 00QyLmjxaSEE8qIZQjBXBj We Own It (Fast & Furious) 2 Chainz 59 1jg2UPoSAr7CDPsEXcabo1 Fast & Furious 6 2013-01-01 edm big room 0.554 ... 1 0.4080 0.05210 0.000000 0.0568 0.552 171.966 227893 0.0 1
9387 00QyLmjxaSEE8qIZQjBXBj We Own It (Fast & Furious) 2 Chainz 59 1jg2UPoSAr7CDPsEXcabo1 Fast & Furious 6 2013-01-01 rap gangster rap 0.554 ... 1 0.4080 0.05210 0.000000 0.0568 0.552 171.966 227893 0.0 1
7853 00ReeHCY0FQUyuAUyPJdnk Ain't No Future In Yo' Frontin' MC Breed 48 7mLks5uEIPmT0056mb5oV3 MC Breed & DFC 1991-01-01 rap southern hip hop 0.672 ... 1 0.2480 0.05140 0.000000 0.4740 0.731 102.192 244733 0.0 1
9345 00ReeHCY0FQUyuAUyPJdnk Ain't No Future In Yo' Frontin' MC Breed 48 7mLks5uEIPmT0056mb5oV3 MC Breed & DFC 1991-01-01 rap gangster rap 0.672 ... 1 0.2480 0.05140 0.000000 0.4740 0.731 102.192 244733 0.0 1
3124 00WIXhVVhswHuS6dlkScuw Hot Confetti 51 6nsXqX8wZbkiqSKmSqxsuT Hot 2019-11-22 pop electropop 0.607 ... 1 0.1070 0.00297 0.000002 0.1200 0.664 168.015 150714 0.0 1
14626 00WIXhVVhswHuS6dlkScuw Hot Confetti 51 6nsXqX8wZbkiqSKmSqxsuT Hot 2019-11-22 rock permanent wave 0.607 ... 1 0.1070 0.00297 0.000002 0.1200 0.664 168.015 150714 0.0 1
18487 00i0O74dXdaKKdCrqHnfXm La Mordidita Ricky Martin 69 375cUd86z58eqXN2yW3Do9 A Quien Quiera Escuchar (Deluxe Edition) 2015-02-10 latin latin pop 0.725 ... 1 0.0658 0.03440 0.000000 0.1830 0.946 142.006 211680 0.0 1
21198 00i0O74dXdaKKdCrqHnfXm La Mordidita Ricky Martin 69 375cUd86z58eqXN2yW3Do9 A Quien Quiera Escuchar (Deluxe Edition) 2015-02-10 latin latin hip hop 0.725 ... 1 0.0658 0.03440 0.000000 0.1830 0.946 142.006 211680 0.0 1
1903 00qOE7OjRl0BpYiCiweZB2 Juke Box Hero Foreigner 67 2Pw51hAGvWpTA3AYl2WVuu 4 (Expanded) 1981 pop post-teen pop 0.357 ... 1 0.0654 0.08280 0.000000 0.0844 0.522 176.647 259800 0.0 1
12529 00qOE7OjRl0BpYiCiweZB2 Juke Box Hero Foreigner 67 2Pw51hAGvWpTA3AYl2WVuu 4 (Expanded) 1981 rock classic rock 0.357 ... 1 0.0654 0.08280 0.000000 0.0844 0.522 176.647 259800 0.0 1
15244 01R0Xdwje645C6xFCnMRvm Talk Dirty To Me Poison 54 0xOBnypzEh4WKROJ51LL09 Look What The Cat Dragged In 1986-01-01 rock hard rock 0.507 ... 1 0.0440 0.02110 0.002480 0.3810 0.708 157.996 223960 0.0 1
12979 01R0Xdwje645C6xFCnMRvm Talk Dirty To Me Poison 54 0xOBnypzEh4WKROJ51LL09 Look What The Cat Dragged In 1986-01-01 rock classic rock 0.507 ... 1 0.0440 0.02110 0.002480 0.3810 0.708 157.996 223960 0.0 1
121 01iyINEYgPQ7ThMZuHUsqS First Love Lost Kings 58 7syMmofF2t1xI0RFCtrSG9 First Love 2017-10-13 pop dance pop 0.619 ... 1 0.3500 0.02570 0.000014 0.1280 0.601 94.380 207428 0.0 1
17491 01iyINEYgPQ7ThMZuHUsqS First Love Lost Kings 58 7syMmofF2t1xI0RFCtrSG9 First Love 2017-10-13 latin tropical 0.619 ... 1 0.3500 0.02570 0.000014 0.1280 0.601 94.380 207428 0.0 1
12828 02138lFv3Bzncr6ScNbLAF Rattle Your Cage Skrizzly Adams 44 3RcttHMKlZ7K1ovxIANPd0 Young Man 2019-11-15 rock classic rock 0.583 ... 1 0.1230 0.02700 0.000000 0.0908 0.498 179.874 209972 0.0 0
15159 02138lFv3Bzncr6ScNbLAF Rattle Your Cage Skrizzly Adams 44 3RcttHMKlZ7K1ovxIANPd0 Young Man 2019-11-15 rock hard rock 0.583 ... 1 0.1230 0.02700 0.000000 0.0908 0.498 179.874 209972 0.0 0
4953 0240T0gP9w6xEgIciBrfVF Talk Is Cheap Nick Murphy / Chet Faker 61 2ytxqdwQ0Hn9JeQmcIWHuh Built on Glass 2014-04-14 pop indie poptimism 0.656 ... 0 0.1710 0.41600 0.000172 0.2450 0.520 140.058 218067 0.0 1
26173 0240T0gP9w6xEgIciBrfVF Talk Is Cheap Nick Murphy / Chet Faker 61 2ytxqdwQ0Hn9JeQmcIWHuh Built on Glass 2014-04-14 r&b neo soul 0.656 ... 0 0.1710 0.41600 0.000172 0.2450 0.520 140.058 218067 0.0 1
4169 02CygBCQOIyEuhNZqHHcNx It Runs Through Me Tom Misch 67 28enuddLPEA914scE6Drvk Geography 2018-04-06 pop indie poptimism 0.802 ... 0 0.2890 0.20500 0.000748 0.2460 0.274 96.916 261881 0.0 1
25425 02CygBCQOIyEuhNZqHHcNx It Runs Through Me Tom Misch 67 28enuddLPEA914scE6Drvk Geography 2018-04-06 r&b neo soul 0.802 ... 0 0.2890 0.20500 0.000748 0.2460 0.274 96.916 261881 0.0 1
2703 02M6vucOvmRfMxTXDUwRXu 7/11 Beyoncé 71 2UJwKSBUz6rtW4QLK74kQu BEYONCÉ [Platinum Edition] 2014-11-24 pop electropop 0.747 ... 0 0.1260 0.01280 0.000000 0.1260 0.560 136.024 213507 0.0 1
22268 02M6vucOvmRfMxTXDUwRXu 7/11 Beyoncé 71 2UJwKSBUz6rtW4QLK74kQu BEYONCÉ [Platinum Edition] 2014-11-24 r&b urban contemporary 0.747 ... 0 0.1260 0.01280 0.000000 0.1260 0.560 136.024 213507 0.0 1
7325 02M6vucOvmRfMxTXDUwRXu 7/11 Beyoncé 71 2UJwKSBUz6rtW4QLK74kQu BEYONCÉ [Platinum Edition] 2014-11-24 rap southern hip hop 0.747 ... 0 0.1260 0.01280 0.000000 0.1260 0.560 136.024 213507 0.0 1
23061 02M6vucOvmRfMxTXDUwRXu 7/11 Beyoncé 71 2UJwKSBUz6rtW4QLK74kQu BEYONCÉ [Platinum Edition] 2014-11-24 r&b hip pop 0.747 ... 0 0.1260 0.01280 0.000000 0.1260 0.560 136.024 213507 0.0 1
214 02itaCXOdC54J0ISjqqFAp All Around The World (La La La) R3HAB 80 0Y59j5oCvwTM2aNyPb6YpJ All Around The World (La La La) 2019-04-05 pop dance pop 0.733 ... 0 0.0330 0.47900 0.064500 0.1050 0.520 124.948 147840 0.0 1
29639 02itaCXOdC54J0ISjqqFAp All Around The World (La La La) R3HAB 80 0Y59j5oCvwTM2aNyPb6YpJ All Around The World (La La La) 2019-04-05 edm pop edm 0.733 ... 0 0.0330 0.47900 0.064500 0.1050 0.520 124.948 147840 0.0 1
18896 02lGHA7bFFplYLihnUWTx8 Tequila Juanes 77 2X45SzRfAFsxgkBxgryWfF Más Futuro Que Pasado 2019-11-22 latin reggaeton 0.757 ... 1 0.1120 0.22600 0.000000 0.2370 0.704 155.994 159547 0.0 1
17611 02lGHA7bFFplYLihnUWTx8 Tequila Juanes 77 2X45SzRfAFsxgkBxgryWfF Más Futuro Que Pasado 2019-11-22 latin latin pop 0.757 ... 1 0.1120 0.22600 0.000000 0.2370 0.704 155.994 159547 0.0 1
27762 02q7qbOYbE89NMFEtOklcc Unity Dimitri Vegas & Like Mike 66 5mnKVK3cksHH5Lzm1OZpiN Unity 2018-07-20 edm electro house 0.650 ... 0 0.0406 0.02410 0.800000 0.2930 0.191 129.999 234462 0.0 1
28409 02q7qbOYbE89NMFEtOklcc Unity Dimitri Vegas & Like Mike 66 5mnKVK3cksHH5Lzm1OZpiN Unity 2018-07-20 edm big room 0.650 ... 0 0.0406 0.02410 0.800000 0.2930 0.191 129.999 234462 0.0 1
31530 02q7qbOYbE89NMFEtOklcc Unity Dimitri Vegas & Like Mike 66 5mnKVK3cksHH5Lzm1OZpiN Unity 2018-07-20 edm progressive electro house 0.650 ... 0 0.0406 0.02410 0.800000 0.2930 0.191 129.999 234462 0.0 1
31042 037yW9RzsLze4OmBYmcH4G Save My Night - BlasterJaxx Remix Armin van Buuren 16 16AD0yLkUKcp2FBtXKOE9j Save My Night (BlasterJaxx Remix) 2014-03-10 edm progressive electro house 0.686 ... 1 0.0475 0.00196 0.745000 0.0520 0.165 129.972 326827 0.0 0
27979 037yW9RzsLze4OmBYmcH4G Save My Night - BlasterJaxx Remix Armin van Buuren 16 16AD0yLkUKcp2FBtXKOE9j Save My Night (BlasterJaxx Remix) 2014-03-10 edm electro house 0.686 ... 1 0.0475 0.00196 0.745000 0.0520 0.165 129.972 326827 0.0 0
26909 03m9WRVBzoxyTeKblvLvpR You Got The Love Keanu Silva 62 14B4NCJRvKpfyQBAGBCJT4 You Got The Love 2019-10-04 edm electro house 0.744 ... 1 0.0558 0.03940 0.000306 0.2950 0.700 124.046 164446 0.0 1
29965 03m9WRVBzoxyTeKblvLvpR You Got The Love Keanu Silva 62 14B4NCJRvKpfyQBAGBCJT4 You Got The Love 2019-10-04 edm pop edm 0.744 ... 1 0.0558 0.03940 0.000306 0.2950 0.700 124.046 164446 0.0 1
6012 03tqyYWC9Um2ZqU0ZN849H No Hands (feat. Roscoe Dash & Wale) Waka Flocka Flame 74 6MQtWELG7aRX7CkAzQ6nLM Flockaveli 2010-10-01 rap hip hop 0.760 ... 1 0.0391 0.00544 0.000000 0.2410 0.361 131.497 263773 0.0 1
7523 03tqyYWC9Um2ZqU0ZN849H No Hands (feat. Roscoe Dash & Wale) Waka Flocka Flame 74 6MQtWELG7aRX7CkAzQ6nLM Flockaveli 2010-10-01 rap southern hip hop 0.760 ... 1 0.0391 0.00544 0.000000 0.2410 0.361 131.497 263773 0.0 1
23363 04KTF78FFg8sOHC1BADqbY Hot In Herre Nelly 71 4HUUHHXBXImwksfbSPqE7q Nellyville 2002-06-25 r&b hip pop 0.956 ... 0 0.1200 0.20600 0.000000 0.0615 0.912 107.075 228240 0.0 1
7236 04KTF78FFg8sOHC1BADqbY Hot In Herre Nelly 71 4HUUHHXBXImwksfbSPqE7q Nellyville 2002-06-25 rap southern hip hop 0.956 ... 0 0.1200 0.20600 0.000000 0.0615 0.912 107.075 228240 0.0 1
6315 04MLEeAMuV9IlHEsD8vF6A No Stylist French Montana 76 0DMvfJWc1DjSbmnJF1NW1o No Stylist 2018-09-20 rap hip hop 0.765 ... 0 0.1270 0.02150 0.000000 0.2270 0.498 147.055 192172 0.0 1
9424 04MLEeAMuV9IlHEsD8vF6A No Stylist French Montana 76 0DMvfJWc1DjSbmnJF1NW1o No Stylist 2018-09-20 rap gangster rap 0.765 ... 0 0.1270 0.02150 0.000000 0.2270 0.498 147.055 192172 0.0 1
11228 04MLEeAMuV9IlHEsD8vF6A No Stylist French Montana 76 0DMvfJWc1DjSbmnJF1NW1o No Stylist 2018-09-20 rap trap 0.765 ... 0 0.1270 0.02150 0.000000 0.2270 0.498 147.055 192172 0.0 1
4683 04ZTP5KsCypmtCmQg5tH9R I'm a Mess Bebe Rexha 80 4TOkZvtqNpg5UHyGxCn0mS Expectations 2018-06-22 pop indie poptimism 0.630 ... 0 0.0253 0.00281 0.000000 0.0719 0.216 97.005 195519 0.0 1
23280 04ZTP5KsCypmtCmQg5tH9R I'm a Mess Bebe Rexha 80 4TOkZvtqNpg5UHyGxCn0mS Expectations 2018-06-22 r&b hip pop 0.630 ... 0 0.0253 0.00281 0.000000 0.0719 0.216 97.005 195519 0.0 1
1480 04ZTP5KsCypmtCmQg5tH9R I'm a Mess Bebe Rexha 80 4TOkZvtqNpg5UHyGxCn0mS Expectations 2018-06-22 pop post-teen pop 0.630 ... 0 0.0253 0.00281 0.000000 0.0719 0.216 97.005 195519 0.0 1
20659 04aAxqtGp5pv12UXAg4pkq Centuries Fall Out Boy 79 022DrG7Wp2PSCwzuD0bSzT American Beauty/American Psycho 2015-01-20 latin latin hip hop 0.394 ... 0 0.0729 0.00359 0.000000 0.1020 0.560 176.044 228360 0.0 1
1898 04aAxqtGp5pv12UXAg4pkq Centuries Fall Out Boy 79 022DrG7Wp2PSCwzuD0bSzT American Beauty/American Psycho 2015-01-20 pop post-teen pop 0.394 ... 0 0.0729 0.00359 0.000000 0.1020 0.560 176.044 228360 0.0 1
29979 04wllvXvWOkZS5NugzeS8O Lost Chris Burke 37 13FrOWDba1sGvu10ofSKFr Lost 2019-08-16 edm pop edm 0.616 ... 0 0.0397 0.12100 0.022800 0.0311 0.189 128.008 272124 0.0 0
27251 04wllvXvWOkZS5NugzeS8O Lost Chris Burke 37 13FrOWDba1sGvu10ofSKFr Lost 2019-08-16 edm electro house 0.616 ... 0 0.0397 0.12100 0.022800 0.0311 0.189 128.008 272124 0.0 0
30800 05CwHjIk71RXVU40boRMnR Call You Mine The Chainsmokers 39 1ONuDpN0a3zhCUyKCgtuzK World War Joy 2019-05-31 edm pop edm 0.591 ... 1 0.0289 0.22500 0.000000 0.4140 0.501 104.003 217653 0.0 0
70 05CwHjIk71RXVU40boRMnR Call You Mine The Chainsmokers 39 1ONuDpN0a3zhCUyKCgtuzK World War Joy 2019-05-31 pop dance pop 0.591 ... 1 0.0289 0.22500 0.000000 0.4140 0.501 104.003 217653 0.0 0
4648 05CwHjIk71RXVU40boRMnR Call You Mine The Chainsmokers 39 1ONuDpN0a3zhCUyKCgtuzK World War Joy 2019-05-31 pop indie poptimism 0.591 ... 1 0.0289 0.22500 0.000000 0.4140 0.501 104.003 217653 0.0 0
14910 05RgAMGypEvqhNs5hPCbMS Panama - 2015 Remaster Van Halen 73 3REUXdj5OPKhuDTrTtCBU0 1984 (Remastered) 1984-01-04 rock hard rock 0.527 ... 1 0.1090 0.00124 0.000048 0.0744 0.463 141.169 210227 0.0 1
13589 05RgAMGypEvqhNs5hPCbMS Panama - 2015 Remaster Van Halen 73 3REUXdj5OPKhuDTrTtCBU0 1984 (Remastered) 1984-01-04 rock classic rock 0.527 ... 1 0.1090 0.00124 0.000048 0.0744 0.463 141.169 210227 0.0 1
30873 05SBRd4fXgn8FX7bf8BCAE I Need Your Love (feat. Ellie Goulding) Calvin Harris 69 7w19PFbxAjwZ7UVNp9z0uT 18 Months 2012-10-26 edm pop edm 0.695 ... 1 0.0483 0.41000 0.000000 0.2370 0.580 124.989 234507 0.0 1
20737 05SBRd4fXgn8FX7bf8BCAE I Need Your Love (feat. Ellie Goulding) Calvin Harris 69 7w19PFbxAjwZ7UVNp9z0uT 18 Months 2012-10-26 latin latin hip hop 0.695 ... 1 0.0483 0.41000 0.000000 0.2370 0.580 124.989 234507 0.0 1
1584 05SBRd4fXgn8FX7bf8BCAE I Need Your Love (feat. Ellie Goulding) Calvin Harris 69 7w19PFbxAjwZ7UVNp9z0uT 18 Months 2012-10-26 pop post-teen pop 0.695 ... 1 0.0483 0.41000 0.000000 0.2370 0.580 124.989 234507 0.0 1

60 rows × 23 columns

In [12]:
# now we must drop the duplicates now that we've identified them
df = df.drop_duplicates(subset = "track_id", inplace = False)

# make sure unique rows match 
print(f"Unique rows - rows when duplicates dropped = {unique_rows - df.shape[0]}")
Unique rows - rows when duplicates dropped = 0
In [13]:
df.nunique()
Out[13]:
track_id                    28356
track_name                  23449
track_artist                10692
track_popularity              101
track_album_id              22545
track_album_name            19743
track_album_release_date     4530
playlist_genre                  6
playlist_subgenre              24
danceability                  822
energy                        952
key                            12
loudness                    10222
mode                            2
speechiness                  1270
acousticness                 3731
instrumentalness             4729
liveness                     1624
valence                      1362
tempo                       17684
duration_ms                 19785
0                               1
popularity_category             2
dtype: int64
In [14]:
df["mode"].unique()
Out[14]:
array([1, 0])
In [15]:
# figuring out how to parse album/song release dates
print(type(df["track_album_release_date"]))
print(df["track_album_release_date"][21][:4])
print(df["track_album_release_date"][21][5:7])
print(df["track_album_release_date"][21][8:10])
<class 'pandas.core.series.Series'>
2019
08
23
In [16]:
df["date_len"] = df["track_album_release_date"].astype(str).str.len()
len = df["date_len"]
year_only = len[len != 10]
year_only[year_only != 4]
Out[16]:
3446     7
3524     7
7614     7
11740    7
11877    7
12208    7
12283    7
12512    7
12538    7
12764    7
12789    7
13135    7
13227    7
13365    7
13395    7
13408    7
13480    7
13576    7
14504    7
15266    7
15408    7
15990    7
22473    7
22744    7
22759    7
Name: date_len, dtype: int64
In [71]:
# Since we have varying levels of date data we only keep year of release
df["Release Year"] = ""
for i in range(unique_rows):
    df.at[df.index[i], "Release Year"] = df["track_album_release_date"].iloc[i][:4]
    
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File /opt/anaconda3/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'track_album_release_date'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[71], line 4
      2 df["Release Year"] = ""
      3 for i in range(unique_rows):
----> 4     df.at[df.index[i], "Release Year"] = df["track_album_release_date"].iloc[i][:4]

File /opt/anaconda3/lib/python3.12/site-packages/pandas/core/frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]

File /opt/anaconda3/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: 'track_album_release_date'
In [18]:
# Don't need the full date or date_len columns anymore
del df["track_album_release_date"]
del df["date_len"]
In [19]:
df.columns
Out[19]:
Index([           'track_id',          'track_name',        'track_artist',
          'track_popularity',      'track_album_id',    'track_album_name',
            'playlist_genre',   'playlist_subgenre',        'danceability',
                    'energy',                 'key',            'loudness',
                      'mode',         'speechiness',        'acousticness',
          'instrumentalness',            'liveness',             'valence',
                     'tempo',         'duration_ms',                     0,
       'popularity_category',        'Release Year'],
      dtype='object')
In [20]:
df = df.rename(columns = {'track_id': 'ID', 
                     'track_name' : 'Title',
                     'track_artist': 'artist',
                     'track_album_id': 'Album ID',
                     'track_album_name': 'Album',
                     'playlist_genre': 'genre',
                     'playlist_subgenre': 'subgenre',
                     'danceability': 'danceability',
                     'energy': 'energy',
                     'key': 'key',
                     'loudness': 'loudness',
                     'mode': 'mode',
                     'speechiness': 'speechiness',
                     'acousticness': 'acousticness',
                     'instrumentalness': 'instrumentalness',
                     'liveness': 'liveness',
                     'valence': 'valence',
                     'tempo': 'tempo',
                     'duration_ms': 'Song Length',
                     'track_popularity': 'Popularity',
                     'popularity_category': 'popularity_category',
                     'Release Year': 'Release Year'
                    })
                  
In [21]:
df = df[['ID',
         'Title',
         'artist',
         'Album ID',
         'Album',
         'genre',
         'subgenre',
         'danceability',
         'energy',
         'key',
         'loudness',
         'mode',
         'speechiness',
         'acousticness',
         'instrumentalness', 
         'liveness',
         'valence',
         'tempo',
         'Song Length',
         'Popularity',
         'popularity_category',
         'Release Year']]
In [22]:
df.describe()
Out[22]:
danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo Song Length Popularity popularity_category
count 28356.000000 28356.000000 28356.000000 28356.000000 28356.000000 28356.000000 28356.000000 28356.000000 28356.000000 28356.000000 28356.00000 28356.000000 28356.000000 28356.000000
mean 0.653372 0.698388 5.368000 -6.817696 0.565489 0.107954 0.177176 0.091117 0.190958 0.510387 120.95618 226575.967026 39.329771 0.449922
std 0.145785 0.183503 3.613904 3.036243 0.495701 0.102556 0.222803 0.232548 0.155894 0.234340 26.95456 61078.450819 23.702376 0.497495
min 0.000000 0.000175 0.000000 -46.448000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 4000.000000 0.000000 0.000000
25% 0.561000 0.579000 2.000000 -8.309250 0.000000 0.041000 0.014375 0.000000 0.092600 0.329000 99.97200 187742.000000 21.000000 0.000000
50% 0.670000 0.722000 6.000000 -6.261000 1.000000 0.062600 0.079700 0.000021 0.127000 0.512000 121.99300 216933.000000 42.000000 0.000000
75% 0.760000 0.843000 9.000000 -4.709000 1.000000 0.133000 0.260000 0.006570 0.249000 0.695000 133.99900 254975.250000 58.000000 1.000000
max 0.983000 1.000000 11.000000 1.275000 1.000000 0.918000 0.994000 0.994000 0.996000 0.991000 239.44000 517810.000000 100.000000 1.000000
In [23]:
df_num = df[['danceability',
             'energy',
             'key',
             'loudness',
             'mode',
             'speechiness',
             'acousticness',
             'instrumentalness',
             'liveness',
             'valence',
             'tempo',
             'Song Length',
             'Popularity']]
In [24]:
# Check for collinearity of numeric data
correlations = df_num.corr()


plt.figure(figsize = (12,10))
sns.heatmap(correlations, annot = True, cmap = "BuPu")
plt.show()
No description has been provided for this image
In [25]:
sns.pairplot(df_num)
Out[25]:
<seaborn.axisgrid.PairGrid at 0x122ab89e0>
No description has been provided for this image
In [26]:
# Remove all NaN data
df.dropna(inplace = True)
In [27]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 28352 entries, 0 to 32832
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   28352 non-null  object 
 1   Title                28352 non-null  object 
 2   artist               28352 non-null  object 
 3   Album ID             28352 non-null  object 
 4   Album                28352 non-null  object 
 5   genre                28352 non-null  object 
 6   subgenre             28352 non-null  object 
 7   danceability         28352 non-null  float64
 8   energy               28352 non-null  float64
 9   key                  28352 non-null  int64  
 10  loudness             28352 non-null  float64
 11  mode                 28352 non-null  int64  
 12  speechiness          28352 non-null  float64
 13  acousticness         28352 non-null  float64
 14  instrumentalness     28352 non-null  float64
 15  liveness             28352 non-null  float64
 16  valence              28352 non-null  float64
 17  tempo                28352 non-null  float64
 18  Song Length          28352 non-null  int64  
 19  Popularity           28352 non-null  int64  
 20  popularity_category  28352 non-null  int64  
 21  Release Year         28352 non-null  object 
dtypes: float64(9), int64(5), object(8)
memory usage: 5.0+ MB
In [28]:
# Data is now ready for analysis
In [29]:
artist_counts = df["artist"].value_counts()
famous = artist_counts[artist_counts > 50]
famous.index
Out[29]:
Index(['Queen', 'Martin Garrix', 'Don Omar', 'David Guetta',
       'Dimitri Vegas & Like Mike', 'Drake', 'Hardwell', 'The Chainsmokers',
       'Logic', 'Guns N' Roses', '2Pac', 'The Weeknd', 'Wisin & Yandel'],
      dtype='object', name='artist')
In [30]:
def artist_plot(artist, df):
    artist_name = artist  
    filtered_data = df[df['artist'] == artist_name]

    # Scatter plot
    plt.figure(figsize=(10, 6))
    plt.scatter(filtered_data['Release Year'], filtered_data['Popularity'], color='blue')
    plt.title(f'{artist_name} - Popularity Score of Songs')
    plt.xlabel('Release Year')
    plt.ylabel('Popularity Score')
    plt.xticks(rotation=90)  # Rotate song names for readability
    plt.tight_layout()  # Adjust layout to prevent clipping of labels
    plt.show()
In [31]:
artist_plot("Queen", df)
No description has been provided for this image
In [32]:
artist_plot("Drake", df)
No description has been provided for this image
In [33]:
def test_artist_affect(df):
   
    # Group the data by artist and extract popularity scores for each artist
    grouped_data = [group['Popularity'].values for _, group in df.groupby('artist')]

    # Perform ANOVA test
    f_statistic, p_value = stats.f_oneway(*grouped_data)

    # Output the results
    print(f'F-statistic: {f_statistic}')
    print(f'P-value: {p_value}')
    
    # Interpret the results
    if p_value < 0.05:
        print('There is significant evidence that the artist affects the popularity score.')
    else:
        print('There is no significant evidence that the artist affects the popularity score.')


test_artist_affect(df)
F-statistic: 1.9805636759344931
P-value: 0.0
There is significant evidence that the artist affects the popularity score.
In [34]:
artist_plot("The Weeknd", df)
No description has been provided for this image
In [35]:
plt.figure(figsize=(12, 8))

# Create a violin plot to show the distribution of 'Popularity' for each 'Release Year' in sorted order
sns.violinplot(x='Release Year', y='Popularity', data=df, order=sorted(df['Release Year'].unique()))

plt.title('Distribution of Popularity Scores by Release Year')
plt.xlabel('Release Year')
plt.ylabel('Popularity')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
No description has been provided for this image
In [36]:
# Create a DF called y with all rows and just the target column
# Reshape your into a 1-dimensional NumPy array object
df_log_reg = df[[ "danceability",
         "energy",
         "key",
         "loudness",
         "mode",
         "speechiness",
         "acousticness",
         "instrumentalness",
         "liveness",
         "valence",
         "tempo",
         "Song Length",
         "popularity_category"]]

y = df_log_reg.iloc[:, -1:].values.reshape(-1, )
print("y:", y.shape)
print(type(y))
print()

# create a DF called X with all rows and all columns before target
X = df_log_reg.iloc[:, :-1]
print("X shape: ", X.shape)
print(type(X))
y: (28352,)
<class 'numpy.ndarray'>

X shape:  (28352, 12)
<class 'pandas.core.frame.DataFrame'>
In [37]:
# Imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# create an instance of sk;earns LogisticRegression
model = LogisticRegression(max_iter = 200, multi_class = 'multinomial', solver = 'lbfgs')

# remove intercept column from independent variables in X
X = X.iloc[:, :-1]
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled = pd.DataFrame(X_scaled, columns = X.columns)
print(X_scaled)
print()
       danceability    energy       key  loudness      mode  speechiness  \
0          0.649057  1.185948  0.175052  1.377883  0.876494    -0.484071   
1          0.498153  0.635555  1.558684  0.608875  0.876494    -0.688858   
2          0.148330  1.267690 -1.208579  1.115070 -1.140909    -0.329017   
3          0.443279  1.262240  0.451779  1.001119  0.876494    -0.057918   
4         -0.023151  0.733645 -1.208579  0.706689  0.876494    -0.702510   
...             ...       ...       ...       ...       ...          ...   
28347     -1.545909  1.218645 -0.931853  1.647942  0.876494    -0.139833   
28348     -0.901138  0.477521 -1.485305  0.775851  0.876494    -0.643025   
28349     -0.853123  0.668251  0.175052  0.631929 -1.140909    -0.583539   
28350     -0.187774  1.033364 -0.931853  1.138453  0.876494     0.010344   
28351     -0.345537  1.011566 -0.101674  0.739953 -1.140909    -0.677156   

       acousticness  instrumentalness  liveness   valence     tempo  
0         -0.337471         -0.391856 -0.806073  0.032493  0.039986  
1         -0.470319         -0.373753  1.065178  0.779272 -0.778593  
2         -0.438902         -0.391756 -0.519323  0.437888  0.113148  
3         -0.666450         -0.391816  0.083685 -0.995928  0.037018  
4         -0.434863         -0.391856 -0.690604  0.915826  0.111960  
...             ...               ...       ...       ...       ...  
28347     -0.451469         -0.391856 -0.796451 -1.281838  0.267559  
28348     -0.787584         -0.373495  1.180648 -0.471049  0.262773  
28349     -0.310542         -0.391852 -0.262724 -0.317426  0.260843  
28350     -0.759713          0.154243  0.975368 -0.863642  0.261548  
28351     -0.794662          1.074443  3.534947 -1.796475  0.260658  

[28352 rows x 11 columns]

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state = 10, shuffle = True)# create cross-tabulation and look at the percentage of each category for the target variable
count_train = pd.crosstab(index = y_train, columns = "Percent")

print("Training Target Variable:")
print()
print(count_train / count_train.sum())
print()

count_test = pd.crosstab(index = y_test, columns = "Percent")

print("Testing Target Variable:")
print()
print(count_test / count_test.sum())
print()
Training Target Variable:

col_0   Percent
row_0          
0      0.547969
1      0.452031

Testing Target Variable:

col_0   Percent
row_0          
0      0.554785
1      0.445215

In [39]:
# Fit the model to the training data.
model.fit(X_train, y_train)



# Output the y-intercept and regression coefficients.
print("Intercept :" + str(model.intercept_))
print("Coefficients: " + str(model.coef_))
print()


# Create a new DataFrame called df_results.
# Add the predictor names and coefficient values to df_results.
# Take the exponents of the coefficients to create the odds ratios.
# Add the odds ratio values to df_results.
my_dictionary = {
    "predictor" : X_train.columns,
    "coefficient": model.coef_[0]
}
df_results = pd.DataFrame(my_dictionary)

df_results["odds_ratios"] = np.exp(df_results["coefficient"])


# Sort the results by odds ratio descending.
# Output the results.

df_results = df_results.sort_values("odds_ratios", ascending = False)
df_results
Intercept :[-0.10243594]
Coefficients: [[ 0.02152963 -0.17891547  0.00335629  0.1622179   0.01531719 -0.0129159
   0.04643728 -0.08183558 -0.01856358  0.02952272  0.03665443]]

Out[39]:
predictor coefficient odds_ratios
3 loudness 0.162218 1.176116
6 acousticness 0.046437 1.047532
10 tempo 0.036654 1.037334
9 valence 0.029523 1.029963
0 danceability 0.021530 1.021763
4 mode 0.015317 1.015435
2 key 0.003356 1.003362
5 speechiness -0.012916 0.987167
8 liveness -0.018564 0.981608
7 instrumentalness -0.081836 0.921423
1 energy -0.178915 0.836177
In [40]:
y_predictions = model.predict(X_test)


# Compare y_test to y_predictions by creating a new DataFrame called df_compare with y_test in one (1) column and y_predictions in another column.
data = {
    "actual": y_test,
    "predicted": y_predictions
}
df_compare = pd.DataFrame(data)
df_compare.head(10)
Out[40]:
actual predicted
0 1 1
1 0 0
2 0 1
3 1 1
4 1 0
5 0 0
6 1 0
7 0 1
8 1 0
9 1 0
In [41]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_predictions))
              precision    recall  f1-score   support

           0       0.60      0.75      0.67      4719
           1       0.54      0.36      0.43      3787

    accuracy                           0.58      8506
   macro avg       0.57      0.56      0.55      8506
weighted avg       0.57      0.58      0.56      8506

In [42]:
df['loud_minus_energy'] = df['loudness'] - df['energy']
df['acoustic_vs_instr'] = df['acousticness'] - df['instrumentalness']
df['loud_acoustic_product'] = df['loudness'] * df['acousticness']
df['inv_energy'] = 1 / (df['energy'] + 1e-5)
df['valence_tempo'] = df['valence'] * df['tempo']

df_new = df[['loud_minus_energy','acoustic_vs_instr','loud_acoustic_product','inv_energy','valence_tempo','key','danceability','speechiness','mode','Song Length','popularity_category']]
In [43]:
model2 = LogisticRegression(max_iter = 200, multi_class = 'multinomial', solver = 'lbfgs')
In [44]:
y = df_new.iloc[:, -1:].values.reshape(-1, )
print("y:", y.shape)
print(type(y))
print()

# create a DF called X with all rows and all columns before target
X = df_new.iloc[:, :-1]
print("X shape: ", X.shape)
print(type(X))
y: (28352,)
<class 'numpy.ndarray'>

X shape:  (28352, 10)
<class 'pandas.core.frame.DataFrame'>
In [45]:
X_scaled = scaler.fit_transform(X)

X_scaled = pd.DataFrame(X_scaled, columns = X.columns)
print(X_scaled)
print()
       loud_minus_energy  acoustic_vs_instr  loud_acoustic_product  \
0               1.360932           0.049407               0.484794   
1               0.594364          -0.055407               0.447710   
2               1.081962          -0.020727               0.483233   
3               0.963579          -0.177859               0.550030   
4               0.690099          -0.017865               0.441438   
...                  ...                ...                    ...   
28347           1.640245          -0.029335               0.537604   
28348           0.778286          -0.274738               0.591067   
28349           0.616325           0.068005               0.378770   
28350           1.121079          -0.635961               0.583337   
28351           0.707256          -1.323521               0.593926   

       inv_energy  valence_tempo       key  danceability  speechiness  \
0       -0.022582       0.051058  0.175052      0.649057    -0.484071   
1       -0.018370       0.240196  1.558684      0.498153    -0.688858   
2       -0.023130       0.450236 -1.208579      0.148330    -0.329017   
3       -0.023094      -0.866664  0.451779      0.443279    -0.057918   
4       -0.019196       0.882571 -1.208579     -0.023151    -0.702510   
...           ...            ...       ...           ...          ...   
28347   -0.022803      -1.080751 -0.931853     -1.545909    -0.139833   
28348   -0.016961      -0.323050 -1.485305     -0.901138    -0.643025   
28349   -0.018650      -0.180032  0.175052     -0.853123    -0.583539   
28350   -0.021510      -0.690663 -0.931853     -0.187774     0.010344   
28351   -0.021352      -1.563232 -0.101674     -0.345537    -0.677156   

           mode  Song Length  
0      0.876494    -0.520964  
1      0.876494    -1.047386  
2     -1.140909    -0.817917  
3      0.876494    -0.941083  
4      0.876494    -0.614317  
...         ...          ...  
28347  0.876494    -0.363450  
28348  0.876494     2.071787  
28349 -1.140909    -0.269524  
28350  0.876494     2.306102  
28351 -1.140909     1.816058  

[28352 rows x 10 columns]

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state = 10, shuffle = True)# create cross-tabulation and look at the percentage of each category for the target variable
count_train = pd.crosstab(index = y_train, columns = "Percent")

print("Training Target Variable:")
print()
print(count_train / count_train.sum())
print()

count_test = pd.crosstab(index = y_test, columns = "Percent")

print("Testing Target Variable:")
print()
print(count_test / count_test.sum())
print()
Training Target Variable:

col_0   Percent
row_0          
0      0.547969
1      0.452031

Testing Target Variable:

col_0   Percent
row_0          
0      0.554785
1      0.445215

In [47]:
model2.fit(X_train, y_train)



# Output the y-intercept and regression coefficients.
print("Intercept :" + str(model2.intercept_))
print("Coefficients: " + str(model2.coef_))
print()


# Create a new DataFrame called df_results.
# Add the predictor names and coefficient values to df_results.
# Take the exponents of the coefficients to create the odds ratios.
# Add the odds ratio values to df_results.
my_dictionary = {
    "predictor" : X_train.columns,
    "coefficient": model2.coef_[0]
}
df_results = pd.DataFrame(my_dictionary)

df_results["odds_ratios"] = np.exp(df_results["coefficient"])


# Sort the results by odds ratio descending.
# Output the results.

df_results = df_results.sort_values("odds_ratios", ascending = False)
df_results
Intercept :[-0.0883006]
Coefficients: [[ 6.62847167e-02  1.39628222e-01  3.83794418e-02  2.34756361e+00
   2.16869635e-02  2.24259121e-03  2.94896491e-02 -1.99390399e-02
   1.42820690e-02 -9.62607172e-02]]

Out[47]:
predictor coefficient odds_ratios
3 inv_energy 2.347564 10.460054
1 acoustic_vs_instr 0.139628 1.149846
0 loud_minus_energy 0.066285 1.068531
2 loud_acoustic_product 0.038379 1.039125
6 danceability 0.029490 1.029929
4 valence_tempo 0.021687 1.021924
8 mode 0.014282 1.014385
5 key 0.002243 1.002245
7 speechiness -0.019939 0.980258
9 Song Length -0.096261 0.908227
In [48]:
y_predictions = model2.predict(X_test)


# Compare y_test to y_predictions by creating a new DataFrame called df_compare with y_test in one (1) column and y_predictions in another column.
data = {
    "actual": y_test,
    "predicted": y_predictions
}
df_compare = pd.DataFrame(data)
df_compare.head(10)
Out[48]:
actual predicted
0 1 1
1 0 0
2 0 1
3 1 1
4 1 0
5 0 0
6 1 0
7 0 0
8 1 0
9 1 0
In [49]:
print(classification_report(y_test, y_predictions))
              precision    recall  f1-score   support

           0       0.59      0.77      0.67      4719
           1       0.54      0.34      0.41      3787

    accuracy                           0.58      8506
   macro avg       0.57      0.55      0.54      8506
weighted avg       0.57      0.58      0.56      8506

In [50]:
# Let's consider top 3 positive features and top 3 negative features
# Positive: inv_energy, loudness, acoustic_vs_instr
# Negative: energy, song length, liveness
df_final = df[['inv_energy', 'loudness', 'acoustic_vs_instr', 'energy', 'Song Length', 'liveness','popularity_category']]
In [51]:
model3 = LogisticRegression(max_iter = 200, multi_class = 'multinomial', solver = 'lbfgs')
In [52]:
y = df_final.iloc[:, -1:].values.reshape(-1, )
print("y:", y.shape)
print(type(y))
print()

# create a DF called X with all rows and all columns before target
X = df_final.iloc[:, :-1]
print("X shape: ", X.shape)
print(type(X))
y: (28352,)
<class 'numpy.ndarray'>

X shape:  (28352, 6)
<class 'pandas.core.frame.DataFrame'>
In [53]:
X_scaled = scaler.fit_transform(X)

X_scaled = pd.DataFrame(X_scaled, columns = X.columns)
print(X_scaled)
print()
       inv_energy  loudness  acoustic_vs_instr    energy  Song Length  \
0       -0.022582  1.377883           0.049407  1.185948    -0.520964   
1       -0.018370  0.608875          -0.055407  0.635555    -1.047386   
2       -0.023130  1.115070          -0.020727  1.267690    -0.817917   
3       -0.023094  1.001119          -0.177859  1.262240    -0.941083   
4       -0.019196  0.706689          -0.017865  0.733645    -0.614317   
...           ...       ...                ...       ...          ...   
28347   -0.022803  1.647942          -0.029335  1.218645    -0.363450   
28348   -0.016961  0.775851          -0.274738  0.477521     2.071787   
28349   -0.018650  0.631929           0.068005  0.668251    -0.269524   
28350   -0.021510  1.138453          -0.635961  1.033364     2.306102   
28351   -0.021352  0.739953          -1.323521  1.011566     1.816058   

       liveness  
0     -0.806073  
1      1.065178  
2     -0.519323  
3      0.083685  
4     -0.690604  
...         ...  
28347 -0.796451  
28348  1.180648  
28349 -0.262724  
28350  0.975368  
28351  3.534947  

[28352 rows x 6 columns]

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state = 10, shuffle = True)
In [55]:
model3.fit(X_train, y_train)



# Output the y-intercept and regression coefficients.
print("Intercept :" + str(model3.intercept_))
print("Coefficients: " + str(model3.coef_))
print()


# Create a new DataFrame called df_results.
# Add the predictor names and coefficient values to df_results.
# Take the exponents of the coefficients to create the odds ratios.
# Add the odds ratio values to df_results.
my_dictionary = {
    "predictor" : X_train.columns,
    "coefficient": model3.coef_[0]
}
df_results = pd.DataFrame(my_dictionary)

df_results["odds_ratios"] = np.exp(df_results["coefficient"])


# Sort the results by odds ratio descending.
# Output the results.

df_results = df_results.sort_values("odds_ratios", ascending = False)
df_results
Intercept :[-0.10304979]
Coefficients: [[ 0.00419707  0.14531476  0.09011132 -0.14829699 -0.09264571 -0.0226359 ]]

Out[55]:
predictor coefficient odds_ratios
1 loudness 0.145315 1.156404
2 acoustic_vs_instr 0.090111 1.094296
0 inv_energy 0.004197 1.004206
5 liveness -0.022636 0.977618
4 Song Length -0.092646 0.911516
3 energy -0.148297 0.862175
In [56]:
y_predictions = model3.predict(X_test)


# Compare y_test to y_predictions by creating a new DataFrame called df_compare with y_test in one (1) column and y_predictions in another column.
data = {
    "actual": y_test,
    "predicted": y_predictions
}
df_compare = pd.DataFrame(data)
df_compare.head(10)
Out[56]:
actual predicted
0 1 1
1 0 1
2 0 1
3 1 0
4 1 0
5 0 1
6 1 0
7 0 0
8 1 0
9 1 0
In [57]:
print(classification_report(y_test, y_predictions))
              precision    recall  f1-score   support

           0       0.60      0.75      0.67      4719
           1       0.55      0.37      0.44      3787

    accuracy                           0.58      8506
   macro avg       0.57      0.56      0.55      8506
weighted avg       0.58      0.58      0.57      8506

In [58]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 28352 entries, 0 to 32832
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     28352 non-null  object 
 1   Title                  28352 non-null  object 
 2   artist                 28352 non-null  object 
 3   Album ID               28352 non-null  object 
 4   Album                  28352 non-null  object 
 5   genre                  28352 non-null  object 
 6   subgenre               28352 non-null  object 
 7   danceability           28352 non-null  float64
 8   energy                 28352 non-null  float64
 9   key                    28352 non-null  int64  
 10  loudness               28352 non-null  float64
 11  mode                   28352 non-null  int64  
 12  speechiness            28352 non-null  float64
 13  acousticness           28352 non-null  float64
 14  instrumentalness       28352 non-null  float64
 15  liveness               28352 non-null  float64
 16  valence                28352 non-null  float64
 17  tempo                  28352 non-null  float64
 18  Song Length            28352 non-null  int64  
 19  Popularity             28352 non-null  int64  
 20  popularity_category    28352 non-null  int64  
 21  Release Year           28352 non-null  object 
 22  loud_minus_energy      28352 non-null  float64
 23  acoustic_vs_instr      28352 non-null  float64
 24  loud_acoustic_product  28352 non-null  float64
 25  inv_energy             28352 non-null  float64
 26  valence_tempo          28352 non-null  float64
dtypes: float64(14), int64(5), object(8)
memory usage: 6.1+ MB
In [59]:
from sklearn.tree import DecisionTreeClassifier
tree_features = df[df.columns[2:]]
tree_features = tree_features[['genre', 'subgenre','danceability', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'Song Length', 'Release Year', 'loud_minus_energy', 'acoustic_vs_instr', 'inv_energy', 'valence_tempo', 'popularity_category']]

columns_to_keep = ['danceability', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'Song Length', 'Release Year', 'loud_minus_energy', 'acoustic_vs_instr', 'inv_energy', 'valence_tempo', 'popularity_category']
scaler = StandardScaler()
numerical_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
                      'instrumentalness', 'liveness', 'valence', 'tempo', 'Song Length']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

genre_dummies = pd.get_dummies(tree_features["genre"], prefix = "genre")
subgenre_dummies = pd.get_dummies(tree_features["subgenre"], prefix = "subgenre")


tree_features = tree_features[columns_to_keep]
In [60]:
# Temporarily remove the target column
popularity_col = tree_features["popularity_category"]
tree_features = tree_features.drop(columns=["popularity_category"])

# Join dummy variables
tree_features = tree_features.join(genre_dummies.loc[:, "genre_2":])
tree_features = tree_features.join(subgenre_dummies.loc[:, "subgenre_2":])
In [61]:
# Add the target column back as the last column
tree_features["popularity_category"] = popularity_col


X = tree_features.iloc[:,:-1]
y = tree_features.iloc[:,-1:].values.reshape(-1,)
In [62]:
X.info()
<class 'pandas.core.frame.DataFrame'>
Index: 28352 entries, 0 to 32832
Data columns (total 46 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   danceability                        28352 non-null  float64
 1   key                                 28352 non-null  int64  
 2   loudness                            28352 non-null  float64
 3   mode                                28352 non-null  int64  
 4   speechiness                         28352 non-null  float64
 5   acousticness                        28352 non-null  float64
 6   instrumentalness                    28352 non-null  float64
 7   liveness                            28352 non-null  float64
 8   valence                             28352 non-null  float64
 9   tempo                               28352 non-null  float64
 10  Song Length                         28352 non-null  int64  
 11  Release Year                        28352 non-null  object 
 12  loud_minus_energy                   28352 non-null  float64
 13  acoustic_vs_instr                   28352 non-null  float64
 14  inv_energy                          28352 non-null  float64
 15  valence_tempo                       28352 non-null  float64
 16  genre_edm                           28352 non-null  bool   
 17  genre_latin                         28352 non-null  bool   
 18  genre_pop                           28352 non-null  bool   
 19  genre_r&b                           28352 non-null  bool   
 20  genre_rap                           28352 non-null  bool   
 21  genre_rock                          28352 non-null  bool   
 22  subgenre_album rock                 28352 non-null  bool   
 23  subgenre_big room                   28352 non-null  bool   
 24  subgenre_classic rock               28352 non-null  bool   
 25  subgenre_dance pop                  28352 non-null  bool   
 26  subgenre_electro house              28352 non-null  bool   
 27  subgenre_electropop                 28352 non-null  bool   
 28  subgenre_gangster rap               28352 non-null  bool   
 29  subgenre_hard rock                  28352 non-null  bool   
 30  subgenre_hip hop                    28352 non-null  bool   
 31  subgenre_hip pop                    28352 non-null  bool   
 32  subgenre_indie poptimism            28352 non-null  bool   
 33  subgenre_latin hip hop              28352 non-null  bool   
 34  subgenre_latin pop                  28352 non-null  bool   
 35  subgenre_neo soul                   28352 non-null  bool   
 36  subgenre_new jack swing             28352 non-null  bool   
 37  subgenre_permanent wave             28352 non-null  bool   
 38  subgenre_pop edm                    28352 non-null  bool   
 39  subgenre_post-teen pop              28352 non-null  bool   
 40  subgenre_progressive electro house  28352 non-null  bool   
 41  subgenre_reggaeton                  28352 non-null  bool   
 42  subgenre_southern hip hop           28352 non-null  bool   
 43  subgenre_trap                       28352 non-null  bool   
 44  subgenre_tropical                   28352 non-null  bool   
 45  subgenre_urban contemporary         28352 non-null  bool   
dtypes: bool(30), float64(12), int64(3), object(1)
memory usage: 4.5+ MB
In [63]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42, shuffle = True)
count_train = pd.crosstab(index = y_train, columns = "Percent")

print("Training Target Variable:")
print()
print(count_train / count_train.sum())
print()

count_test = pd.crosstab(index = y_test, columns = "Percent")

print("Testing Target Variable:")
print()
print(count_test / count_test.sum())
print()
Training Target Variable:

col_0   Percent
row_0          
0      0.551698
1      0.448302

Testing Target Variable:

col_0   Percent
row_0          
0      0.546085
1      0.453915

In [64]:
clf = DecisionTreeClassifier(criterion = "entropy", max_depth = 5)
In [65]:
clf = clf.fit(X_train, y_train)
In [66]:
y_pred = clf.predict(X_test)
In [67]:
data = {
    "actual": y_test,
    "predicted": y_pred
}
df_comp = pd.DataFrame(data)
df_comp.head(30)
Out[67]:
actual predicted
0 1 0
1 0 0
2 1 0
3 0 0
4 0 0
5 1 1
6 0 0
7 0 1
8 0 0
9 1 1
10 0 0
11 1 0
12 0 0
13 1 1
14 0 0
15 0 1
16 1 0
17 1 0
18 0 0
19 0 0
20 1 0
21 0 0
22 1 0
23 1 0
24 1 0
25 0 1
26 1 1
27 1 1
28 0 0
29 1 0
In [68]:
print("Accuracy: ",accuracy_score(y_test,y_pred))
Accuracy:  0.6177992005643076
In [69]:
print("Precision: ",precision_score(y_test, y_pred, pos_label = 1))
Precision:  0.6683222958057395
In [70]:
print("Recall: ", recall_score(y_test, y_pred, pos_label = 1))
Recall:  0.31364931364931364
In [ ]:
 
In [ ]: